1 Effect of UPSTM-Based Decorrelation on Feature Discovery

1.0.1 Loading the libraries

library("FRESA.CAD")
library(readxl)
library(igraph)
library(umap)
library(tsne)
library(entropy)

op <- par(no.readonly = TRUE)
pander::panderOptions('digits', 3)
pander::panderOptions('table.split.table', 400)
pander::panderOptions('keep.trailing.zeros',TRUE)

1.1 Material and Methods

Data Source https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6960825/

Mohino-Herranz I, Gil-Pita R, Rosa-Zurera M, Seoane F. Activity Recognition Using Wearable Physiological Measurements: Selection of Features from a Comprehensive Literature Study. Sensors (Basel). 2019 Dec 13;19(24):0. doi: 10.3390/s19245524. PMID: 31847261; PMCID: PMC6960825.

1.2 The Data

Activitydata <- read.csv("~/GitHub/LatentBiomarkers/Data/ActivityData/data.txt", header=FALSE, stringsAsFactors=TRUE)
featNames <- read.table("~/GitHub/LatentBiomarkers/Data/ActivityData/Featurelabels.txt", quote="\"", comment.char="")
featNames <- as.character(t(featNames))
featNames <- str_replace_all(featNames,"\\(abs\\)","_A_")
featNames[duplicated(featNames)] <- paste(featNames[duplicated(featNames)],"D",sep="_")

rep_ID <- numeric(nrow(Activitydata))
ctr <- 1
for (ind in c(1:(nrow(Activitydata)-1)))
{
  rep_ID[ind] <- ctr
  if (Activitydata$V1[ind] != Activitydata$V1[ind+1]) ctr <- 0;
  ctr <- ctr + 1
}
rownames(Activitydata) <- paste(Activitydata$V1,rep_ID,sep="_")
colnames(Activitydata) <- c("ID",featNames,"class")
Activitydata$rep <- rep_ID
  
tb <- table(Activitydata$class)

classes <- c("Neu","Emo","Men","Phy")
names(classes) <- names(tb)
Activitydata$class <- classes[as.character(Activitydata$class)]
table(Activitydata$class)
#> 
#>  Emo  Men  Neu  Phy 
#> 1120 1120 1120 1120



ID_class <- paste(Activitydata$ID,Activitydata$class)
IDCLASS <- unique(ID_class)
theclass <- Activitydata$class[!duplicated(ID_class)]
theIDs <- Activitydata$ID[!duplicated(ID_class)]

ActivitydataAvg <- NULL
for (id in IDCLASS)
{
  ActivitydataAvg <- rbind(ActivitydataAvg,apply(Activitydata[ID_class==id,featNames],2,mean))
}
colnames(ActivitydataAvg) <- featNames
rownames(ActivitydataAvg) <- IDCLASS
ActivitydataAvg <- as.data.frame(ActivitydataAvg)
ActivitydataAvg$class <- theclass
ActivitydataAvg$ID <- theIDs

table(ActivitydataAvg$class)
#> 
#> Emo Men Neu Phy 
#>  40  40  40  40


ActivitydataAvg <- subset(ActivitydataAvg, class=="Men" | class=="Emo")

ActivitydataAvg$class <- 1*(ActivitydataAvg$class == "Men")
table(ActivitydataAvg$class)
#> 
#>  0  1 
#> 40 40

1.2.0.1 Standarize the names for the reporting

studyName <- "Activity"
dataframe <- ActivitydataAvg
outcome <- "class"

TopVariables <- 10
thro <- 0.80
cexheat = 0.15

1.3 Generaring the report

1.3.1 Libraries

Some libraries

library(psych)
library(whitening)
library("vioplot")

1.3.2 Data specs

pander::pander(c(rows=nrow(dataframe),col=ncol(dataframe)-1))
rows col
80 534
pander::pander(table(dataframe[,outcome]))
0 1
40 40

varlist <- colnames(dataframe)
varlist <- varlist[varlist != outcome]

largeSet <- length(varlist) > 1000

1.3.3 Scaling the data

Scaling and removing near zero variance columns and highly co-linear(r>0.99999) columns


  ### Some global cleaning
  sdiszero <- apply(dataframe,2,sd) > 1.0e-16
  dataframe <- dataframe[,sdiszero]

  varlist <- colnames(dataframe)[colnames(dataframe) != outcome]
  tokeep <- c(as.character(correlated_Remove(dataframe,varlist,thr=0.99999)),outcome)
  dataframe <- dataframe[,tokeep]

  varlist <- colnames(dataframe)
  varlist <- varlist[varlist != outcome]


dataframe <- FRESAScale(dataframe,method="OrderLogit")$scaledData

1.4 The heatmap of the data


if (!largeSet)
{
  
  hm <- heatMaps(data=dataframe,
                 Outcome=outcome,
                 Scale=TRUE,
                 hCluster = "row",
                 xlab="Feature",
                 ylab="Sample",
                 srtCol=45,
                 srtRow=45,
                 cexCol=cexheat,
                 cexRow=cexheat
                 )
  par(op)
}

1.4.0.1 Correlation Matrix of the Data

The heat map of the data


if (!largeSet)
{

  par(cex=0.6,cex.main=0.85,cex.axis=0.7)
  #cormat <- Rfast::cora(as.matrix(dataframe[,varlist]),large=TRUE)
  cormat <- cor(dataframe[,varlist],method="pearson")
  cormat[is.na(cormat)] <- 0
  gplots::heatmap.2(abs(cormat),
                    trace = "none",
  #                  scale = "row",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Original Correlation",
                    cexRow = cexheat,
                    cexCol = cexheat,
                     srtCol=45,
                     srtRow=45,
                    key.title=NA,
                    key.xlab="Pearson Correlation",
                    xlab="Feature", ylab="Feature")
  diag(cormat) <- 0
  print(max(abs(cormat)))
}

[1] 1

1.5 The decorrelation


DEdataframe <- IDeA(dataframe,verbose=TRUE,thr=thro)
#> 
#>  Included: 362 , Uni p: 0.01628202 , Uncorrelated Base: 62 , Outcome-Driven Size: 0 , Base Size: 62 
#> 
#> 
 1 <R=1.000,w=  1,N=  265>, Top: 47( 21 )[ 1 : 47 Fa= 47 : 0.975 ]( 47 , 183 , 0 ),<|>Tot Used: 230 , Added: 183 , Zero Std: 0 , Max Cor: 1.000
#> 
 2 <R=1.000,w=  1,N=  265>, Top: 28( 6 )[ 1 : 28 Fa= 74 : 0.975 ]( 27 , 94 , 47 ),<|>Tot Used: 258 , Added: 94 , Zero Std: 0 , Max Cor: 1.000
#> 
 3 <R=1.000,w=  1,N=  265>, Top: 10( 2 )[ 1 : 10 Fa= 84 : 0.975 ]( 10 , 20 , 74 ),<|>Tot Used: 258 , Added: 20 , Zero Std: 0 , Max Cor: 1.000
#> 
 4 <R=1.000,w=  2,N=   92>, Top: 37( 1 )[ 1 : 37 Fa= 99 : 0.950 ]( 37 , 45 , 84 ),<|>Tot Used: 280 , Added: 45 , Zero Std: 0 , Max Cor: 0.992
#> 
 5 <R=0.992,w=  2,N=   92>, Top: 10( 1 )[ 1 : 10 Fa= 100 : 0.946 ]( 10 , 10 , 99 ),<|>Tot Used: 280 , Added: 10 , Zero Std: 0 , Max Cor: 0.946
#> 
 6 <R=0.946,w=  2,N=   92>, Top: 19( 1 )[ 1 : 19 Fa= 107 : 0.923 ]( 19 , 20 , 100 ),<|>Tot Used: 289 , Added: 20 , Zero Std: 0 , Max Cor: 0.983
#> 
 7 <R=0.983,w=  2,N=   92>, Top: 1( 1 )[ 1 : 1 Fa= 107 : 0.941 ]( 1 , 1 , 107 ),<|>Tot Used: 289 , Added: 1 , Zero Std: 0 , Max Cor: 0.930
#> 
 8 <R=0.930,w=  3,N=   85>, Top: 31( 2 )[ 1 : 31 Fa= 118 : 0.865 ]( 31 , 38 , 107 ),<|>Tot Used: 303 , Added: 38 , Zero Std: 0 , Max Cor: 0.983
#> 
 9 <R=0.983,w=  3,N=   85>, Top: 10( 1 )[ 1 : 10 Fa= 119 : 0.892 ]( 8 , 10 , 118 ),<|>Tot Used: 307 , Added: 10 , Zero Std: 0 , Max Cor: 0.954
#> 
 10 <R=0.954,w=  3,N=   85>, Top: 2( 1 )[ 1 : 2 Fa= 121 : 0.877 ]( 2 , 2 , 119 ),<|>Tot Used: 307 , Added: 2 , Zero Std: 0 , Max Cor: 0.963
#> 
 11 <R=0.963,w=  4,N=   62>, Top: 29( 2 )[ 1 : 29 Fa= 131 : 0.832 ]( 27 , 30 , 121 ),<|>Tot Used: 312 , Added: 30 , Zero Std: 0 , Max Cor: 0.891
#> 
 12 <R=0.891,w=  4,N=   62>, Top: 25( 1 )[ 1 : 25 Fa= 136 : 0.800 ]( 24 , 27 , 131 ),<|>Tot Used: 317 , Added: 27 , Zero Std: 0 , Max Cor: 0.988
#> 
 13 <R=0.988,w=  4,N=   62>, Top: 2( 1 )[ 1 : 2 Fa= 138 : 0.844 ]( 2 , 2 , 136 ),<|>Tot Used: 317 , Added: 2 , Zero Std: 0 , Max Cor: 0.981
#> 
 14 <R=0.981,w=  5,N=    6>, Top: 3( 1 )[ 1 : 3 Fa= 139 : 0.800 ]( 3 , 3 , 138 ),<|>Tot Used: 317 , Added: 3 , Zero Std: 0 , Max Cor: 0.804
#> 
 15 <R=0.804,w=  5,N=    6>, Top: 1( 1 )[ 1 : 1 Fa= 140 : 0.800 ]( 1 , 1 , 139 ),<|>Tot Used: 317 , Added: 1 , Zero Std: 0 , Max Cor: 0.798
#> 
 16 <R=0.798,w=  6,N=    0>
#> 
 [ 16 ], 0.7983933 Decor Dimension: 317 . Cor to Base: 195 , ABase: 31 , Outcome Base: 0 
#> 
varlistc <- colnames(DEdataframe)[colnames(DEdataframe) != outcome]

pander::pander(sum(apply(dataframe[,varlist],2,var)))

616

pander::pander(sum(apply(DEdataframe[,varlistc],2,var)))

158

pander::pander(entropy(discretize(unlist(dataframe[,varlist]), 256)))

4.84

pander::pander(entropy(discretize(unlist(DEdataframe[,varlistc]), 256)))

3.12

1.5.1 The decorrelation matrix


if (!largeSet)
{

  par(cex=0.6,cex.main=0.85,cex.axis=0.7)
  
  UPSTM <- attr(DEdataframe,"UPSTM")
  
  gplots::heatmap.2(1.0*(abs(UPSTM)>0),
                    trace = "none",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Decorrelation matrix",
                    cexRow = cexheat,
                    cexCol = cexheat,
                   srtCol=45,
                   srtRow=45,
                    key.title=NA,
                    key.xlab="|Beta|>0",
                    xlab="Output Feature", ylab="Input Feature")
  
  par(op)
}

1.6 The heatmap of the decorrelated data

if (!largeSet)
{

  hm <- heatMaps(data=DEdataframe,
                 Outcome=outcome,
                 Scale=TRUE,
                 hCluster = "row",
                 cexRow = cexheat,
                 cexCol = cexheat,
                 srtCol=45,
                 srtRow=45,
                 xlab="Feature",
                 ylab="Sample")
  par(op)
}

1.7 The correlation matrix after decorrelation

if (!largeSet)
{

  cormat <- cor(DEdataframe[,varlistc],method="pearson")
  cormat[is.na(cormat)] <- 0
  
  gplots::heatmap.2(abs(cormat),
                    trace = "none",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Correlation after IDeA",
                    cexRow = cexheat,
                    cexCol = cexheat,
                     srtCol=45,
                     srtRow=45,
                    key.title=NA,
                    key.xlab="Pearson Correlation",
                    xlab="Feature", ylab="Feature")
  
  par(op)
  diag(cormat) <- 0
  print(max(abs(cormat)))
}

[1] 0.9976649

1.8 U-MAP Visualization of features

1.8.1 The UMAP based on LASSO on Raw Data

classes <- unique(dataframe[,outcome])
raincolors <- rainbow(length(classes))
names(raincolors) <- classes
datasetframe.umap = umap(scale(dataframe[,varlist]),n_components=2)
plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: Original",t='n')
text(datasetframe.umap$layout,labels=dataframe[,outcome],col=raincolors[dataframe[,outcome]+1])

1.8.2 The decorralted UMAP


datasetframe.umap = umap(scale(DEdataframe[,varlistc]),n_components=2)
plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: After IDeA",t='n')
text(datasetframe.umap$layout,labels=DEdataframe[,outcome],col=raincolors[DEdataframe[,outcome]+1])

1.9 Univariate Analysis

1.9.1 Univariate



univarRAW <- uniRankVar(varlist,
               paste(outcome,"~1"),
               outcome,
               dataframe,
               rankingTest="AUC")

100 : ECG_p_LF_mean 200 : IT_CCV_LF 300 : EDA_Original_mad_D




univarDe <- uniRankVar(varlistc,
               paste(outcome,"~1"),
               outcome,
               DEdataframe,
               rankingTest="AUC",
               )

100 : La_ECG_p_LF_mean 200 : La_IT_CCV_LF 300 : La_EDA_Original_mad_D

1.9.2 Final Table


univariate_columns <- c("caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC")

##topfive
topvar <- c(1:length(varlist)) <= TopVariables
pander::pander(univarRAW$orderframe[topvar,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP ROCAUC
ECG_hrv_prctile75 0.483 1.031 -0.1471 1.286 0.0849 0.731
ECG_hrv_geomean_A_ -0.105 1.208 0.5327 1.093 0.1001 0.727
IT_LF_baseline_D 0.518 1.044 -0.2219 0.700 0.5845 0.721
IT_p_Total_baseline 0.507 1.009 -0.2168 0.668 0.6797 0.721
IT_VLF_baseline 0.492 0.990 -0.2217 0.652 0.7285 0.720
ECG_hrv_prctile25 0.244 1.150 -0.4451 0.938 0.5416 0.719
IT_PSD_baseline 0.554 1.095 -0.1829 0.773 0.2875 0.715
ECG_hrv_mean 0.259 0.865 -0.3517 0.931 0.2559 0.714
IT_HF_baseline 0.692 1.254 -0.0466 1.052 0.0555 0.713
ECG_hrv_trimmean25 0.282 0.950 -0.3372 0.967 0.5083 0.711


topLAvar <- univarDe$orderframe$Name[str_detect(univarDe$orderframe$Name,"La_")]
topLAvar <- unique(c(univarDe$orderframe$Name[topvar],topLAvar[1:as.integer(TopVariables/2)]))
finalTable <- univarDe$orderframe[topLAvar,univariate_columns]

theLaVar <- rownames(finalTable)[str_detect(rownames(finalTable),"La_")]

pander::pander(univarDe$orderframe[topLAvar,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP ROCAUC
La_IT_BRV_baseline -0.15629 0.4237 0.1964 0.2851 0.9649 0.759
**La_EDA_processed_geomean_A__D** 0.29051 0.5653 -0.2026 0.4549 0.7895 0.742
La_EDA_Filt1_mad_D -0.01455 0.0533 0.0233 0.0324 0.3238 0.733
La_ECG_HR_min_div_baseline 0.06680 0.4682 0.2872 0.4039 0.0950 0.731
La_ECG_RR_window_baseline 0.14303 0.2844 -0.0680 0.2889 0.9594 0.729
La_EDA_processed_std_D 0.24649 0.3816 -0.0880 0.3304 0.1748 0.724
IT_p_Total_baseline 0.50669 1.0086 -0.2168 0.6684 0.6797 0.721
La_EDA_Original_max_D 0.00529 0.0520 -0.0230 0.0302 0.0128 0.716
ECG_hrv_mean 0.25862 0.8655 -0.3517 0.9306 0.2559 0.714
La_EDA_processed_trimmean25_D 0.21468 0.6566 -0.1355 0.3130 0.4613 0.710

dc <- getLatentCoefficients(DEdataframe)
fscores <- attr(DEdataframe,"fscore")

theSigDc <- dc[theLaVar]
names(theSigDc) <- NULL
theSigDc <- unique(names(unlist(theSigDc)))


theFormulas <- dc[rownames(finalTable)]
deFromula <- character(length(theFormulas))
names(deFromula) <- rownames(finalTable)

pander::pander(c(mean=mean(sapply(dc,length)),total=length(dc),fraction=length(dc)/(ncol(dataframe)-1)))
mean total fraction
2.85 281 0.77


allSigvars <- names(dc)



dx <- names(deFromula)[1]
for (dx in names(deFromula))
{
  coef <- theFormulas[[dx]]
  cname <- names(theFormulas[[dx]])
  names(cname) <- cname
  for (cf in names(coef))
  {
    if (cf != dx)
    {
      if (coef[cf]>0)
      {
        deFromula[dx] <- paste(deFromula[dx],
                               sprintf("+ %5.3f*%s",coef[cf],cname[cf]))
      }
      else
      {
        deFromula[dx] <- paste(deFromula[dx],
                               sprintf("%5.3f*%s",coef[cf],cname[cf]))
      }
    }
  }
}

finalTable <- rbind(finalTable,univarRAW$orderframe[theSigDc[!(theSigDc %in% rownames(finalTable))],univariate_columns])


orgnamez <- rownames(finalTable)
orgnamez <- str_remove_all(orgnamez,"La_")
finalTable$RAWAUC <- univarRAW$orderframe[orgnamez,"ROCAUC"]
finalTable$DecorFormula <- deFromula[rownames(finalTable)]
finalTable$fscores <- fscores[rownames(finalTable)]

Final_Columns <- c("DecorFormula","caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC","RAWAUC","fscores")

finalTable <- finalTable[order(-finalTable$ROCAUC),]
pander::pander(finalTable[,Final_Columns])
  DecorFormula caseMean caseStd controlMean controlStd controlKSP ROCAUC RAWAUC fscores
La_IT_BRV_baseline -0.832IT_BRV_prctile25 + 1.000IT_BRV_baseline -0.15629 0.4237 0.19644 0.2851 0.96495 0.759 0.642 -1
**La_EDA_processed_geomean_A__D** + 1.000*EDA_processed_geomean_A__D -0.977*EDA_Filt2_std_D 0.29051 0.5653 -0.20256 0.4549 0.78945 0.742 0.601 2
La_EDA_Filt1_mad_D + 1.000EDA_Filt1_mad_D -1.026EDA_Filt2_std_D -0.01455 0.0533 0.02326 0.0324 0.32376 0.733 0.508 0
La_ECG_HR_min_div_baseline + 0.912ECG_RR_window_geomean_A_ + 1.000ECG_HR_min_div_baseline 0.06680 0.4682 0.28718 0.4039 0.09500 0.731 0.518 -1
La_ECG_RR_window_baseline -0.914ECG_RR_window_geomean_A_ + 1.000ECG_RR_window_baseline 0.14303 0.2844 -0.06795 0.2889 0.95939 0.729 0.522 -1
La_EDA_processed_std_D + 1.000EDA_processed_std_D -0.890EDA_Filt2_std_D 0.24649 0.3816 -0.08800 0.3304 0.17475 0.724 0.581 1
IT_p_Total_baseline 0.50669 1.0086 -0.21676 0.6684 0.67972 0.721 0.721 5
La_EDA_Original_max_D + 1.000EDA_Original_max_D -1.797EDA_Original_prctile75_D + 0.805*EDA_Filt1_prctile25_D 0.00529 0.0520 -0.02299 0.0302 0.01285 0.716 0.575 -2
ECG_hrv_mean 0.25862 0.8655 -0.35169 0.9306 0.25586 0.714 0.714 4
La_EDA_processed_trimmean25_D + 1.000EDA_processed_trimmean25_D -0.932EDA_processed_median_D 0.21468 0.6566 -0.13552 0.3130 0.46134 0.710 0.582 -1
IT_BRV_baseline NA -0.35971 0.8468 -0.00106 1.0856 0.08274 0.642 0.642 NA
ECG_RR_window_geomean_A_ NA -0.09206 0.9615 0.20263 0.9615 0.99561 0.603 0.603 15
**EDA_processed_geomean_A__D** NA 0.68903 1.3691 0.21447 1.0251 0.05106 0.601 0.601 NA
EDA_processed_trimmean25_D NA -0.39525 1.3811 -0.61752 1.3648 0.01157 0.582 0.582 NA
EDA_processed_std_D NA 0.60948 1.1881 0.29184 0.9118 0.01488 0.581 0.581 NA
EDA_Original_max_D NA 0.63696 1.3434 0.36718 1.1892 0.00726 0.575 0.575 NA
EDA_Filt1_prctile25_D NA 0.66089 1.4082 0.37552 1.2296 0.00966 0.573 0.573 34
EDA_Original_prctile75_D NA 0.64742 1.3754 0.38526 1.2143 0.01041 0.570 0.570 NA
EDA_processed_median_D NA -0.65445 1.3990 -0.51718 1.3120 0.00378 0.530 0.530 NA
ECG_RR_window_baseline NA 0.05886 0.9529 0.11732 0.9276 0.77168 0.522 0.522 NA
ECG_HR_min_div_baseline NA 0.15072 1.0047 0.10247 0.9849 0.67448 0.518 0.518 NA
EDA_Filt1_mad_D NA 0.40391 1.0951 0.46113 1.1993 0.01682 0.508 0.508 NA
IT_BRV_prctile25 NA -0.24439 1.0856 -0.23727 1.0882 0.27464 0.503 0.503 5
EDA_Filt2_std_D NA 0.40783 1.0812 0.42676 1.1548 0.02041 0.489 0.489 11